library(magrittr) # quando der problema com o "%>%"
library(dplyr) # selecao e filtro de dados
library(geosphere) # localizacao geoespacial
library(lubridate) # datas, funcoes hour, month, wday
library(plotly) # plot dos graficos
library(knitr) # usada pelo plotly
library(dummies) # cria colunas binarias para variaveis categoricas
library(scales) # normaliza dados rescalando para float de 0 a 1
library(randomForest) # cria rede neural para criar regressao de tempo de viagem
library(tidyverse)
library(yaml)
source('preprocessing.R')
source('mapa_calor_ny.R')
read.csv(gzfile("./data_source/train.csv.gz")) %>%
as.data.frame() -> train
head(train, 3)
sum(is.na(train))
## [1] 0
summary(train)
## id vendor_id pickup_datetime
## id0000001: 1 Min. :1.000 2016-01-12 18:48:44: 5
## id0000003: 1 1st Qu.:1.000 2016-02-09 21:03:38: 5
## id0000005: 1 Median :2.000 2016-03-04 08:07:34: 5
## id0000008: 1 Mean :1.535 2016-04-05 18:55:21: 5
## id0000009: 1 3rd Qu.:2.000 2016-05-07 13:18:07: 5
## id0000011: 1 Max. :2.000 2016-06-10 23:17:17: 5
## (Other) :1458638 (Other) :1458614
## dropoff_datetime passenger_count pickup_longitude
## 2016-02-19 19:25:04: 5 Min. :0.000 Min. :-121.93
## 2016-05-16 19:40:28: 5 1st Qu.:1.000 1st Qu.: -73.99
## 2016-01-07 08:04:32: 4 Median :1.000 Median : -73.98
## 2016-01-08 12:43:38: 4 Mean :1.665 Mean : -73.97
## 2016-01-08 13:00:41: 4 3rd Qu.:2.000 3rd Qu.: -73.97
## 2016-01-09 15:59:42: 4 Max. :9.000 Max. : -61.34
## (Other) :1458618
## pickup_latitude dropoff_longitude dropoff_latitude store_and_fwd_flag
## Min. :34.36 Min. :-121.93 Min. :32.18 N:1450599
## 1st Qu.:40.74 1st Qu.: -73.99 1st Qu.:40.74 Y: 8045
## Median :40.75 Median : -73.98 Median :40.75
## Mean :40.75 Mean : -73.97 Mean :40.75
## 3rd Qu.:40.77 3rd Qu.: -73.96 3rd Qu.:40.77
## Max. :51.88 Max. : -61.34 Max. :43.92
##
## trip_duration
## Min. : 1
## 1st Qu.: 397
## Median : 662
## Mean : 959
## 3rd Qu.: 1075
## Max. :3526282
##
set.seed(20)
# Criando uma sequencia de 1 ate a quantidade de total de linhas
linhas.idx <- seq_len(nrow(train))
# Obtendo aleatoriamente 10000 amostras de linhas do dataset
linhas.sample <- sample(linhas.idx, 5000)
# Amostra aleatoria com 10000 dados
df <- train[linhas.sample, ]
head(df, 10)
df$bairro_saida = mapply(define_bairro, df$pickup_longitude, df$pickup_latitude)
df$bairro_chegada = mapply(define_bairro, df$dropoff_longitude, df$dropoff_latitude)
#Distancia em KM
df$dist_euclidiana = dist_eucl(df)
df$dist_manhattan = dist_manh(df)
df$velocidade = df$dist_manhattan / df$trip_duration
df$pickup_hour <- hour(df$pickup_datetime)
df$pickup_month <- month(df$pickup_datetime)
df$pickup_weekdays <- wday(df$pickup_datetime)
df$sen_hour <- sin(df$pickup_hour / 3.4)
df$sen_month <- sin(df$pickup_month / 1.7)
df$sen_week <- sin(df$pickup_weekdays)
plot( sin(seq(from = 1, to = 24/3.4, length.out = 70 )), type = 'o' )
df %>%
filter(df$dist_manhattan > 0.5) -> df
df %>%
filter(df$trip_duration < 10000) -> df
Regioes NY
df %>%
group_by(bairro_saida) %>%
count() -> data_plot
plot1 = plot_ly(data= data_plot, x= ~bairro_saida, y= ~n, type = 'bar')
df %>%
group_by(bairro_chegada) %>%
count() -> data_plot
plot2 = plot_ly(data= data_plot, x= ~bairro_chegada, y= ~n, type = 'bar')
subplot(plot1, plot2, shareY = T)
df %>%
group_by(bairro_saida) %>%
summarize(velocidade_media = mean(velocidade),n()) -> data_plot
plot1 = plot_ly(data= data_plot, x= ~bairro_saida, y= ~velocidade_media, type = 'bar')
df %>%
group_by(bairro_chegada) %>%
summarize(velocidade_media = mean(velocidade),n()) -> data_plot
plot2 = plot_ly(data= data_plot, x= ~bairro_chegada, y= ~velocidade_media, type = 'bar')
subplot(plot1, plot2, shareY = T)
p1 = plot_ly(data= df, x= ~passenger_count, y= ~trip_duration, type = 'scatter', mode = 'markers')
p2 = plot_ly(data= df, x= ~dist_manhattan, y= ~trip_duration, type = 'scatter', mode = 'markers') %>%
layout(title="Correlacao Num. Passageiros vs. Tempo | Correlacao Distancia vs. Tempo")
subplot(p1, p2)
df %>%
group_by(pickup_hour) %>%
summarize(velocidade_media = mean(velocidade),n()) -> data_plot1
plot1 = plot_ly(data= data_plot1, x= ~pickup_hour, y= ~velocidade_media, type = 'scatter', mode='lines')
df %>%
group_by(pickup_weekdays) %>%
summarize(velocidade_media = mean(velocidade),n()) -> data_plot2
plot2 = plot_ly(data= data_plot2, x= ~pickup_weekdays, y= ~velocidade_media, type = 'scatter', mode='lines') %>%
layout(title="Horas | Dias da Semana")
subplot(plot1, plot2, shareY = T)
df %>%
group_by(pickup_hour) %>%
count() -> data_plot1
plot1 = plot_ly(data= data_plot1, x= ~pickup_hour, y= ~n, type = 'bar')
df %>%
group_by(pickup_weekdays) %>%
count() -> data_plot2
plot2 = plot_ly(data= data_plot2, x= ~pickup_weekdays, y= ~n, type = 'bar') %>%
layout(title="Horas | Dias da Semana")
subplot(plot1, plot2)
heat_map_taxi(train, "pickup")
## OGR data source with driver: ESRI Shapefile
## Source: "C:\Users\Bruno Aquino\Documents\Trabalho R\v3\analise_taxi\data_source\mapa_ny", layer: "geo_export_8661594b-4f67-485f-8af1-84a4bd06054d"
## with 5 features
## It has 4 fields
heat_map_taxi(train, "dropoff")
## OGR data source with driver: ESRI Shapefile
## Source: "C:\Users\Bruno Aquino\Documents\Trabalho R\v3\analise_taxi\data_source\mapa_ny", layer: "geo_export_8661594b-4f67-485f-8af1-84a4bd06054d"
## with 5 features
## It has 4 fields
kmeans_data <- df[, c("trip_duration", "dist_euclidiana")]
boxplot(kmeans_data[, c("trip_duration")], las=1, xlab="trip_duration")
boxplot(kmeans_data[, c("dist_euclidiana")], las=1, xlab=c("dist_euclidiana"))
normalized <-(kmeans_data-min(kmeans_data))/(max(kmeans_data)-min(kmeans_data))
clusters <- kmeans(normalized, centers = 3)
plot(normalized, col=clusters$cluster, pch=21, cex=1)
df %>%
mutate(pickup_time_in_minutes = minute(pickup_datetime) + hour(pickup_datetime) * 60) %>%
mutate(hour_quarter = pickup_time_in_minutes %/% 15) -> df
head(df, 3)
df %>%
group_by(hour_quarter) %>%
summarise(count = n()) -> hour_quarter_freq
plot(hour_quarter_freq, type = "o", main="Grafico de linha temporal por quartos de hora", xlab="Quarto de hora", ylab="Numero de viagens")
df %>%
group_by(pickup_month) %>%
summarise(count = n()) -> month_freq
plot(month_freq, type = "o", main="Grafico de linha temporal mensal", xlab="Mês", ylab="Número de viagens")
df %>%
group_by(pickup_weekdays) %>%
summarise(count = n()) -> weekday_freq
plot(weekday_freq, type = "o", main="Gráfico de linha temporal por dia da semana", xlab="Dia da semana", ylab="Número de viagens")
bairro_dummy = dummy(df$bairro_chegada, sep='_')
df = data.frame(cbind(df, bairro_dummy))
df$dist_manhattan = rescale(df$dist_manhattan)
df$trip_duration = rescale(df$trip_duration)
df$passenger_count = rescale(df$trip_duration)
head(train, 3)
# DANDO PROBLEMA NA GERACAO DO HTML MARKDOWN
#X <- df[c('passenger_count'
# , 'dist_manhattan'
# , 'sen_hour'
# , 'sen_week'
# , 'sen_month'
# , 'bairro_chegada_1'
# , 'bairro_chegada_2'
# , 'bairro_chegada_3'
# , 'bairro_chegada_4'
# , 'bairro_chegada_5'
# , 'bairro_chegada_6'
# , 'bairro_chegada_7'
# , 'bairro_chegada_8'
# , 'bairro_chegada_9')]
#y <- df['trip_duration']